Set up species and input files

Using PLAZA ortholog groups from PlantGenIE inferred using OrthoMCL: ftp://plantgenie.org/Data/Cross-Species/Orthologs/PLAZA/orthologs.ORTHO.plantgenie.txt.gz

Species names: from PLAZA e.g. “potri” and “piabi”

Expression files: tab-separated file where the first column contains gene names and is called “Genes”. Here we use data from AspWood and NorWood.

# Species name in Plaza
species1_name <- "potri"
species2_name <- "piabi"

# Expression files
species1_expr_file <- "Data/AspWood_expression.txt"
species2_expr_file <- "Data/NorWood_expression.txt"

# Ortholog groups
ortholog_group_file <- "Data/orthologs.ORTHO.plantgenie.txt.gz"

Read in ortholog groups and expression data

# Read in ortholog groups
# =======================

# Reading in and parsing the PLAZA file takes time. Parsed results are saved
# and used if they exists.
ortholog_group_RData <- paste0("RData/orthologs-", species1_name, "-", species2_name, ".RData")

if (!file.exists(ortholog_group_RData)) {
  ortho <- read.delim(ortholog_group_file)
  
  ortho <- ortho %>%
    filter(species == species1_name) %>%
    group_by(gene_content) %>%
    mutate(OrthoGroup = cur_group_id()) %>%
    ungroup() %>%
    separate_rows(gene_content, sep = ";", convert = FALSE) %>%
    filter(grepl(species2_name, gene_content)) %>%
    separate(gene_content, into = c("prefix", "gene_content"), sep = ":") %>%
    separate_rows(gene_content, sep = ",", convert = FALSE) %>%
    mutate(Species1 = gene_id, Species2 = gene_content) %>%
    select(Species1, Species2, OrthoGroup)
  
  save(ortho, file = ortholog_group_RData)
} else {
  load(file = ortholog_group_RData)
}

# Read in expression data
# =======================

species1_expr <- read.delim(species1_expr_file, sep = "\t", header = TRUE)

species2_expr <- read.delim(species2_expr_file, sep = "\t", header = TRUE)

# Filter
# ======

cat (length(unique(ortho$OrthoGroup)), " ortholog groups containing:\n",
     " ", length(unique(ortho$Species1)), " ", species1_name, " genes\n",
     " ", length(unique(ortho$Species2)), " ", species2_name, " genes\n\n",
     length(unique(species1_expr$Genes)), " expressed ", species1_name, " genes\n",
     length(unique(species2_expr$Genes)), " expressed ", species2_name, " genes\n",
     sep = "")
## 8235 ortholog groups containing:
##  29158 potri genes
##  35245 piabi genes
## 
## 28295 expressed potri genes
## 18514 expressed piabi genes
ortho <- ortho %>%
  filter(Species1 %in% species1_expr$Genes & Species2 %in% species2_expr$Genes)

species1_expr <- species1_expr[species1_expr$Genes %in% ortho$Species1,]
species2_expr <- species2_expr[species2_expr$Genes %in% ortho$Species2,]

cat ("After filtering on expressed genes with ortholog:\n",
     " ", length(unique(ortho$OrthoGroup)), " ortholog groups containing: \n",
     "  ", length(unique(ortho$Species1)), " ", species1_name, " genes\n",
     "  ", length(unique(ortho$Species2)), " ", species2_name, " genes\n",
     sep = "")
## After filtering on expressed genes with ortholog:
##  6574 ortholog groups containing: 
##   19332 potri genes
##   12872 piabi genes

Compute co-expression networks

There are several parameters that need to be set:

cor_method <- "pearson" # pearson spearman
cor_sign <- "" # abs
norm_method <- "MR" # CLR MR
density_thr <- 0.03
randomize <- "" # rand

cat ("Parameters", "\n", 
     "  cor_method  = ", cor_method, "\n",
     "  norm_method = ", norm_method, "\n",
     "  density_thr = ", density_thr, "\n", sep = "")
## Parameters
##   cor_method  = pearson
##   norm_method = MR
##   density_thr = 0.03
comparison_RData <- paste0("RData/comparison-", species1_name, "-", species2_name, "-", 
                           cor_sign, cor_method, norm_method, density_thr, randomize, ".RData")

if (!file.exists(comparison_RData)) {
  
  if (randomize == "rand") {
    species1_expr$Genes <- sample(species1_expr$Genes, nrow(species1_expr), FALSE)
    species2_expr$Genes <- sample(species2_expr$Genes, nrow(species2_expr), FALSE)
  }
  
  species1_net <- cor(t(species1_expr[,-1]), method = cor_method)
  dimnames(species1_net) <- list(species1_expr$Genes, species1_expr$Genes)
  
  species2_net <- cor(t(species2_expr[,-1]), method = cor_method)
  dimnames(species2_net) <- list(species2_expr$Genes, species2_expr$Genes)
  
  if (cor_sign == "abs") {
    species1_net <- abs(species1_net)
    species2_net <- abs(species2_net)
  }
  
  if (norm_method == "CLR") {
    
    z <- scale(species1_net)
    z[z < 0] <- 0
    species1_net <- sqrt(t(z)**2 + z**2)
    
    z <- scale(species2_net)
    z[z < 0] <- 0
    species2_net <- sqrt(t(z)**2 + z**2)
    
  } else if (norm_method == "MR") {
    R <- t(apply(species1_net, 1, rank))
    species1_net <- sqrt(R * t(R))
    
    R <- t(apply(species2_net, 1, rank))
    species2_net <- sqrt(R * t(R))
  }
  
  diag(species1_net) <- 0
  diag(species2_net) <- 0
  
  R <- sort(species1_net[upper.tri(species1_net, diag = FALSE)], decreasing = TRUE)
  species1_thr <- R[round(density_thr*length(R))]
  plot(density(R), xlab = paste0(species1_name, " correlations"), main = "")
  
  R <- sort(species2_net[upper.tri(species2_net, diag = FALSE)], decreasing = TRUE)
  species2_thr <- R[round(density_thr*length(R))]
  plot(density(R), xlab = paste0(species2_name, " correlations"), main = "")
  
} else {
  load(file = comparison_RData)
}

cat(species1_name, ": co-expr threshold = ", format(species1_thr, digits = 3) , "\n",
    species2_name, ": co-expr threshold = ", format(species2_thr, digits = 3) , "\n", sep = "")
## potri: co-expr threshold = 18546
## piabi: co-expr threshold = 12397

Network comparison

For each ortholog pair, p-values for for the overlap of network neighborhoods are computed in both direction.

if (!file.exists(comparison_RData)) {

  comparison <- ortho
  
  comparison$Species1.neigh <- c(NA)
  comparison$Species1.ortho.neigh <- c(NA)
  comparison$Species1.neigh.overlap <- c(NA)
  comparison$Species1.p.val <- c(NA)
  comparison$Species1.effect.size <- c(NA)
  
  comparison$Species2.neigh <- c(NA)
  comparison$Species2.ortho.neigh <- c(NA)
  comparison$Species2.neigh.overlap <- c(NA)
  comparison$Species2.p.val <- c(NA)
  comparison$Species2.effect.size <- c(NA)
  
  for (i in 1:nrow(ortho)) {
    
    if (i %% 1000 == 0) {
      cat(i, "/", nrow(ortho), "\n")
    }
    
    # Species 1 -> Species 2
    
    neigh <- species1_net[ortho$Species1[i],]
    neigh <- names(neigh[neigh >= species1_thr])
    
    ortho_neigh <- species2_net[ortho$Species2[i],]
    ortho_neigh <- names(ortho_neigh[ortho_neigh >= species2_thr])
    ortho_neigh <- unique(ortho$Species1[ortho$Species2 %in% ortho_neigh])
    
    N <- nrow(species1_expr)
    m <- length(neigh)
    n <- N-m
    k <- length(ortho_neigh)
    x <- length(intersect(neigh, ortho_neigh))
    p_val <- 1
    effect_size <- 1
    if (x > 1) {
      p_val <- phyper(x-1, m, n, k, lower.tail = FALSE)
      effect_size <- (x/k) / (m/N)
    }
    
    comparison$Species1.neigh[i] <- m
    comparison$Species1.ortho.neigh[i] <- k
    comparison$Species1.neigh.overlap[i] <- x
    comparison$Species1.p.val[i] <- p_val
    comparison$Species1.effect.size[i] <- effect_size
    
    # Species 2 -> Species 1
    
    neigh <- species2_net[ortho$Species2[i],]
    neigh <- names(neigh[neigh >= species2_thr])
    
    ortho_neigh <- species1_net[ortho$Species1[i],]
    ortho_neigh <- names(ortho_neigh[ortho_neigh >= species1_thr])
    ortho_neigh <- unique(ortho$Species2[ortho$Species1 %in% ortho_neigh])
    
    N <- nrow(species2_expr)
    m <- length(neigh)
    n <- N-m
    k <- length(ortho_neigh)
    x <- length(intersect(neigh, ortho_neigh))
    p_val <- 1
    effect_size <- 1
    if (x > 1) {
      p_val <- phyper(x-1, m, n, k, lower.tail = FALSE)
      effect_size <- (x/k) / (m/N)
    }
    
    comparison$Species2.neigh[i] <- m
    comparison$Species2.ortho.neigh[i] <- k
    comparison$Species2.neigh.overlap[i] <- x
    comparison$Species2.p.val[i] <- p_val
    comparison$Species2.effect.size[i] <- effect_size
  }
  
  save(comparison, species1_thr, species2_thr, file = comparison_RData)
}

# Filter orthologs not in the networks
comparison <- comparison %>%
  filter(Species1.neigh.overlap > 0 & Species2.neigh.overlap > 0)

# FDR correction
comparison$Species1.p.val <- p.adjust(comparison$Species1.p.val, method = "fdr")
comparison$Species2.p.val <- p.adjust(comparison$Species2.p.val, method = "fdr")

cat ("After filtering on gene pairs in the networks:\n",
     " ", length(unique(comparison$OrthoGroup)), " ortholog groups containing: \n",
     "  ", length(unique(comparison$Species1)), " ", species1_name, " genes\n",
     "  ", length(unique(comparison$Species2)), " ", species2_name, " genes\n",
     sep = "")
## After filtering on gene pairs in the networks:
##  6547 ortholog groups containing: 
##   19201 potri genes
##   12835 piabi genes
# Comparison of p-values of orthologs: species 1 -> species 2 vs species 2 -> species 1
R <- cor.test(-log10(comparison$Species1.p.val), -log10(comparison$Species2.p.val))

data.frame(s1 = -log10(comparison$Species1.p.val),
           s2 = -log10(comparison$Species2.p.val)) %>%
  ggplot(aes(x = s1, y = s2)) +
  xlab(paste0(species1_name, " p-value (-log10)")) +
  ylab(paste0(species2_name, " p-value (-log10)")) +
  geom_point() + 
  geom_smooth(method=lm, formula = y ~ x, fill = "gainsboro") +
  ggtitle(paste0("Correlation = ", format(R$estimate, digits = 3)))

# Comparison of p-values and effect sizes
comparison %>%
  rowwise() %>%
  mutate(Max.p.val = max(Species1.p.val, Species2.p.val)) %>%
  mutate(Min.effect.size = min(Species1.effect.size, Species2.effect.size)) %>%
  mutate(NeighborhoodSize = mean(c(Species1.neigh,Species2.neigh))) %>% 
  filter(Max.p.val < 0.05) %>% 
  ggplot(aes(x = -log10(Max.p.val), y = Min.effect.size, col = NeighborhoodSize)) +
  xlab("P-value") +
  ylab("Effect size") +
  geom_point()

# Print some summary statistics
# =============================

# Gene pairs

cat("GENE PAIRS:\n",
    sum(comparison$Species1.p.val< 0.05), " conserved ", species1_name, " gene pairs (", 
    format(sum(comparison$Species1.p.val< 0.05)/nrow(comparison), digits = 3), ")\n", 
    sum(comparison$Species2.p.val< 0.05), " conserved ", species2_name, " gene pairs (", 
    format(sum(comparison$Species2.p.val< 0.05)/nrow(comparison), digits = 3), ")\n",
    sum(comparison$Species1.p.val< 0.05 & comparison$Species2.p.val< 0.05), " reciprocally conserved gene pairs (", 
    format(sum(comparison$Species1.p.val< 0.05 & comparison$Species2.p.val< 0.05)/nrow(comparison), digits = 3), ")\n", sep = "")
## GENE PAIRS:
## 31968 conserved potri gene pairs (0.366)
## 35765 conserved piabi gene pairs (0.409)
## 25105 reciprocally conserved gene pairs (0.287)
# Genes

comparison_species1 <- comparison %>%
  group_by(Species1) %>%
  arrange(Species1.p.val) %>%
  slice(1)

comparison_species2 <- comparison %>%
  group_by(Species2) %>%
  arrange(Species2.p.val) %>%
  slice(1)

comparison_species1_12 <- comparison %>%
  rowwise() %>%
  mutate(Max.p.val = max(Species1.p.val, Species2.p.val)) %>%
  group_by(Species1) %>%
  arrange(Max.p.val) %>%
  slice(1)

comparison_species2_12 <- comparison %>%
  rowwise() %>%
  mutate(Max.p.val = max(Species1.p.val, Species2.p.val)) %>%
  group_by(Species2) %>%
  arrange(Max.p.val) %>%
  slice(1)

cat("GENES:\n",
    sum(comparison_species1$Species1.p.val< 0.05), " conserved ", species1_name, " genes (", 
    format(sum(comparison_species1$Species1.p.val< 0.05)/nrow(comparison_species1), digits = 3), ")\n", 
    sum(comparison_species2$Species2.p.val< 0.05), " conserved ", species2_name, " genes (", 
    format(sum(comparison_species2$Species2.p.val< 0.05)/nrow(comparison_species2), digits = 3), ")\n",
    sum(comparison_species1_12$Max.p.val< 0.05), " reciprocally conserved ", species1_name, " genes (", 
    format(sum(comparison_species1_12$Max.p.val< 0.05 & 
               comparison_species1_12$Max.p.val< 0.05)/nrow(comparison_species1_12), digits = 3), ")\n",
    sum(comparison_species2_12$Max.p.val< 0.05), " reciprocally conserved ", species2_name, " genes (", 
    format(sum(comparison_species2_12$Max.p.val< 0.05 & 
               comparison_species2_12$Max.p.val< 0.05)/nrow(comparison_species2_12), digits = 3), ")\n",
    sep = "")
## GENES:
## 9528 conserved potri genes (0.496)
## 6981 conserved piabi genes (0.544)
## 8163 reciprocally conserved potri genes (0.425)
## 5858 reciprocally conserved piabi genes (0.456)
# Orthogroups

comparison_species1 <- comparison %>%
  group_by(OrthoGroup) %>%
  arrange(Species1.p.val) %>%
  slice(1)

comparison_species2 <- comparison %>%
  group_by(OrthoGroup) %>%
  arrange(Species2.p.val) %>%
  slice(1)

comparison_species12 <- comparison %>%
  rowwise() %>%
  mutate(Max.p.val = max(Species1.p.val, Species2.p.val)) %>%
  group_by(OrthoGroup) %>%
  arrange(Max.p.val) %>%
  slice(1)

cat("ORTHOLOG GROUPS:\n",
    sum(comparison_species1$Species1.p.val< 0.05), " conserved ", species1_name, " orthogroups (", 
    format(sum(comparison_species1$Species1.p.val< 0.05)/nrow(comparison_species1), digits = 3), ")\n", 
    sum(comparison_species2$Species2.p.val< 0.05), " conserved ", species2_name, " orthogroups (", 
    format(sum(comparison_species2$Species2.p.val< 0.05)/nrow(comparison_species2), digits = 3), ")\n",
    sum(comparison_species12$Max.p.val< 0.05), " reciprocally conserved orthogroups (", 
    format(sum(comparison_species12$Max.p.val< 0.05 & 
               comparison_species12$Max.p.val< 0.05)/nrow(comparison_species12), digits = 3), ")\n", sep = "")
## ORTHOLOG GROUPS:
## 3630 conserved potri orthogroups (0.554)
## 3582 conserved piabi orthogroups (0.547)
## 3013 reciprocally conserved orthogroups (0.46)

Table of all comparisons.

potri_annot <- read.delim("data/Ptrichocarpa_210_v3.0.annotation_info.txt.gz", header = F, sep = "\t")
potri_annot <- potri_annot[, c("V2", "V12", "V13")]
colnames(potri_annot) <- c("Species1", "Symbol", "Description")
potri_annot <- potri_annot %>%
  group_by(Species1) %>%
  slice(1)

comparison_table <- comparison %>%
  rowwise() %>%
  mutate(Max.p.val = max(Species1.p.val, Species2.p.val)) %>%
  mutate(Min.effect.size = min(Species1.effect.size, Species2.effect.size)) %>%
  left_join(potri_annot, by = "Species1") %>%
  select(-c("Species1.neigh", "Species1.ortho.neigh", "Species2.neigh", "Species2.ortho.neigh")) %>%
  arrange(Max.p.val)
  
comparison_table$Species1.p.val <- format(comparison_table$Species1.p.val, digits = 3, scientific = TRUE)
comparison_table$Species1.effect.size <- format(comparison_table$Species1.effect.size, digits = 1, scientific = FALSE)
comparison_table$Species2.p.val <- format(comparison_table$Species2.p.val, digits = 3, scientific = TRUE)
comparison_table$Species2.effect.size <- format(comparison_table$Species2.effect.size, digits = 1, scientific = FALSE)
comparison_table$Max.p.val <- format(comparison_table$Max.p.val, digits = 3, scientific = TRUE)
comparison_table$Min.effect.size <- format(comparison_table$Min.effect.size, digits = 1, scientific = FALSE)

datatable(comparison_table, rownames = FALSE, filter = "top",
          options = list(
            columnDefs = list(list(className = 'dt-center', targets = "_all"))
            )
          )